import pandas as pd # Pandas is used for data manipulation
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
from sklearn.metrics.pairwise import haversine_distances
%matplotlib inline
import plotly.express as px
import folium
from folium import FeatureGroup, LayerControl, Map, Marker
from folium.plugins import HeatMap
from folium.plugins import TimestampedGeoJson
from folium.plugins import MarkerCluster
# matplotlib 3.6 renamed the seaborn styles to 'seaborn-v0_8-*' and removed
# the old aliases; fall back so the script runs on either version.
try:
    plt.style.use('seaborn-whitegrid')
except OSError:
    plt.style.use('seaborn-v0_8-whitegrid')

# Load the first 50k rows of the NYC taxi-fare training data; parse the
# pickup timestamp so datetime accessors work later.
nyc = pd.read_csv('train.csv', nrows=50000, parse_dates=["pickup_datetime"])

# Quick look at the data: first rows, column dtypes, summary statistics.
nyc.head(10)
nyc.dtypes
nyc.describe()

# Basic sanity filters: the NYC flag-drop minimum fare is $2.50, and a trip
# must carry at least one passenger.
nyc = nyc[nyc['fare_amount'] > 2.50]
nyc = nyc[nyc['passenger_count'] > 0]
nyc.shape

# Now checking for missing data
nyc.isnull().sum()

# Boxplot of fares to eyeball outliers before the 3-sigma filter below.
sns.boxplot(nyc['fare_amount'])
From the boxplot we can see that there are a lot of outliers. We will remove them in the steps below.
# Drop fare outliers using the 3-sigma rule: keep only rides whose fare lies
# strictly within three standard deviations of the mean fare.
fare_mean = nyc.fare_amount.mean()
fare_std = nyc.fare_amount.std(ddof=0)  # ddof=0 matches np.std (population std)
lower_bound = fare_mean - 3 * fare_std
upper_bound = fare_mean + 3 * fare_std
nyc = nyc[(nyc.fare_amount > lower_bound) & (nyc.fare_amount < upper_bound)]
nyc.head()
New York City's approximate coordinate bounds (https://www.travelmath.com/cities/New+York,+NY) are: longitude -74.3 to -72.9, latitude 40.5 to 41.8.
We will delete all records where the pickup or dropoff longitude and latitude doesn't lie between the above values.
# Keep only trips whose pickup AND dropoff coordinates fall inside the
# greater-NYC bounding box: longitude [-74.3, -72.9], latitude [40.5, 41.8].
# Series.between is inclusive on both ends, matching the original >=/<= tests.
in_nyc_box = (
    nyc['pickup_longitude'].between(-74.3, -72.9)
    & nyc['dropoff_longitude'].between(-74.3, -72.9)
    & nyc['pickup_latitude'].between(40.5, 41.8)
    & nyc['dropoff_latitude'].between(40.5, 41.8)
)
nyc = nyc[in_nyc_box]
nyc.describe()
nyc.shape
# Distribution of the remaining fares.
sns.distplot(nyc.fare_amount)
Since we need to calculate the distance between two points given their latitude and longitude, we will use the haversine formula. The haversine formula determines the great-circle distance between two points on a sphere given their longitudes and latitudes.
# To calculate the distance in miles we use the haversine formula.
def distance(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance in miles between two points.

    Accepts scalars or numpy/pandas arrays of coordinates in decimal
    degrees and returns the distance(s) in miles. Returns 0 for
    identical points.
    """
    p = 0.017453292519943295  # pi/180: degrees -> radians
    # Haversine term via cosines: hav(dlat) + cos(lat1)*cos(lat2)*hav(dlon)
    a = (0.5 - np.cos((lat2 - lat1) * p) / 2
         + np.cos(lat1 * p) * np.cos(lat2 * p)
         * (1 - np.cos((lon2 - lon1) * p)) / 2)
    # 12742 km is the Earth's mean diameter; 0.6213712 converts km to miles.
    return 0.6213712 * 12742 * np.arcsin(np.sqrt(a))
Adding a new `distance` column to the dataframe storing the haversine distance of the corresponding trips.
# Store the haversine trip distance (miles) for every ride.
nyc['distance'] = distance(nyc.pickup_latitude, nyc.pickup_longitude,
                           nyc.dropoff_latitude, nyc.dropoff_longitude)
nyc.distance.describe()

# Break the pickup timestamp into calendar/time-of-day features using the
# vectorised .dt accessor.
# BUG FIX: the original `lambda x: x.time` stored the *bound method* object
# (Timestamp.time was never called), so the subsequent `x.hour` lookup
# failed; `.dt.time` / `.dt.hour` return the actual values.
nyc['year'] = nyc.pickup_datetime.dt.year
nyc['month'] = nyc.pickup_datetime.dt.month
nyc['day'] = nyc.pickup_datetime.dt.day
nyc['time'] = nyc.pickup_datetime.dt.time
nyc['hour'] = nyc.pickup_datetime.dt.hour
nyc.head()
# Approximate lat/lng bounding boxes for the five NYC boroughs, used by
# getBorough() to tag pickup/dropoff coordinates. Boxes overlap in places,
# so the dict's insertion order decides ties (Manhattan is checked first).
# NOTE(review): these look like hand-picked approximations -- verify before
# using them for anything beyond exploratory tagging.
nyc_boroughs={
'manhattan':{
'min_lng':-74.0479,
'min_lat':40.6829,
'max_lng':-73.9067,
'max_lat':40.8820
},
'queens':{
'min_lng':-73.9630,
'min_lat':40.5431,
'max_lng':-73.7004,
'max_lat':40.8007
},
'brooklyn':{
'min_lng':-74.0421,
'min_lat':40.5707,
'max_lng':-73.8334,
'max_lat':40.7395
},
'bronx':{
'min_lng':-73.9339,
'min_lat':40.7855,
'max_lng':-73.7654,
'max_lat':40.9176
},
'staten_island':{
'min_lng':-74.2558,
'min_lat':40.4960,
'max_lng':-74.0522,
'max_lat':40.6490
}
}
def getBorough(lat, lng):
    """Return the name of the borough whose bounding box contains (lat, lng).

    Falls back to 'others' when the point lies outside every box. Where
    boxes overlap, the first match in nyc_boroughs' insertion order wins.
    """
    for name, box in nyc_boroughs.items():
        if (box['min_lat'] <= lat <= box['max_lat']
                and box['min_lng'] <= lng <= box['max_lng']):
            return name
    return 'others'
# Tag each trip with the borough of its pickup and dropoff point.
nyc['pickup_borough'] = [
    getBorough(lat, lng)
    for lat, lng in zip(nyc['pickup_latitude'], nyc['pickup_longitude'])
]
nyc['dropoff_borough'] = [
    getBorough(lat, lng)
    for lat, lng in zip(nyc['dropoff_latitude'], nyc['dropoff_longitude'])
]
nyc.head()
# Trips per passenger count. 'key' is just a convenient non-null column to
# count rows with. x/y are passed as keywords because seaborn >= 0.12
# removed the positional (x, y) form of barplot.
pass_count = nyc.groupby('passenger_count').count()
plt.subplots(figsize=(15,8))
sns.barplot(x=pass_count.index, y=pass_count.key)
plt.xlabel('Passengers')
plt.ylabel('No. of Trips')
plt.title('Count of Passengers')
From the graph we can see that the number of trips with a passenger count of 1 exceeds 30,000, which accounts for more than 60% of the trips recorded in the given dataset.
# Average fare by passenger count. Keyword x/y: seaborn >= 0.12 removed the
# positional (x, y) form of barplot.
passenger_fare = nyc.groupby(['passenger_count']).mean()
fig, ax = plt.subplots(figsize=(17,10))
sns.barplot(x=passenger_fare.index, y=passenger_fare['fare_amount'])
plt.xlabel('Number of Passengers')
plt.ylabel('Average Fare Price')
plt.title('Average Fare Price for Number of Passengers')
plt.show()
# Overall price per mile: total fares divided by total miles.
# (The original label "USD/ :" dropped the unit -- the ratio is USD per mile.)
print("Average ride cost in USD per mile: {}".format(nyc.fare_amount.sum()/nyc["distance"].sum()))

# Scatterplot of distance vs fare.
fig, axs = plt.subplots(1, 2, figsize=(16,6))
axs[0].scatter(nyc.distance, nyc.fare_amount, alpha=0.2)
axs[0].set_xlabel('distance mile')
axs[0].set_ylabel('fare $USD')
axs[0].set_title('All data')
# Zoom in on trips shorter than 15 miles.
# NOTE(review): the title mentions fare < $100 but no fare filter is applied
# here; the 3-sigma filter above already capped extreme fares.
idx = (nyc.distance < 15)
axs[1].scatter(nyc[idx].distance, nyc[idx].fare_amount, alpha=0.2)
axs[1].set_xlabel('distance mile')
axs[1].set_ylabel('fare $USD')
axs[1].set_title('Zoom in on distance < 15 mile, fare < $100');
def _bar_chart(series, xlabel, ylabel, title):
    """Draw a 17x10 bar chart of `series`: index on x, values on y."""
    fig, ax = plt.subplots(figsize=(17, 10))
    # Keyword x/y: seaborn >= 0.12 removed the positional (x, y) form.
    sns.barplot(x=series.index, y=series)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()

# Trip counts and average fares broken down by year, month, day, hour and
# borough. ('key' is simply a non-null column used to count rows; this
# replaces ten copy-pasted plotting cells with one helper.)
_bar_chart(nyc.groupby('year').count().key, 'Year', 'Number of taxi trips',
           'No. of taxi trips per year')
_bar_chart(nyc.groupby('year').mean().fare_amount, 'Year', 'Avg. Fare Amount',
           'Avg. Fare Amount vs Year')
_bar_chart(nyc.groupby('month').count().key, 'Month', 'Number of taxi trips',
           'No. of taxi trips per month')
_bar_chart(nyc.groupby('month').mean().fare_amount, 'Month', 'Avg. Fare Price',
           'Avg. fare price per month')
_bar_chart(nyc.groupby('day').count().key, 'Day', 'Number of taxi trips',
           'No. of taxi trips per day')
_bar_chart(nyc.groupby('day').mean().fare_amount, 'Day', 'Avg. fare price',
           'Avg. fare price per day')
_bar_chart(nyc.groupby('hour').count().key, 'hour', 'Number of taxi trips',
           'No. of taxi trips per hour')
_bar_chart(nyc.groupby('hour').mean().fare_amount, 'hour', 'Avg. fare price',
           'Avg. fare price')
_bar_chart(nyc.groupby('dropoff_borough').mean().fare_amount, 'Drop off Borough',
           'Avg. Fare price', 'Avg. fare price in each dropoff borough')
_bar_chart(nyc.groupby('pickup_borough').mean().fare_amount, 'pick up Borough',
           'Avg. Fare price', 'Avg. fare price in each pickup borough')
# Interactive-plotting stack: plotly (in offline/notebook mode) and
# cufflinks, which bridges pandas DataFrames to plotly.
import plotly
import chart_studio.plotly as py
import plotly.offline as offline
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Render plotly figures inline in the notebook.
init_notebook_mode(connected=True)
import cufflinks as cf
from plotly.graph_objs import Scatter, Figure, Layout
# Use cufflinks fully offline (no Chart Studio account needed).
cf.set_config_file(offline=True)
# Visualise every pickup location on a Mapbox street map. `customdata`
# carries the trip key for hover/selection callbacks.
pickuplocation = [go.Scattermapbox(
    lat=nyc['pickup_latitude'],
    lon=nyc['pickup_longitude'],
    customdata=nyc['key'],
    mode='markers',
    marker=dict(
        size=5,
        color='red',
        opacity=.2,
    ),
)]
# SECURITY NOTE: a Mapbox access token is hard-coded below. Tokens are
# credentials -- load from an environment variable before publishing.
layoutpan = go.Layout(
    autosize=False,
    mapbox=dict(
        accesstoken="pk.eyJ1Ijoic2hhejEzIiwiYSI6ImNqYXA3NjhmeDR4d3Iyd2w5M2phM3E2djQifQ.yyxsAzT94VGYYEEOhxy87w",
        bearing=10,
        pitch=10,
        zoom=13,
        center=dict(
            lat=40.721319,
            lon=-73.987130),
        style="mapbox://styles/mapbox/streets-v11"),
    width=800,
    height=700, title=" Customer Pickup Visualization in NYC")
# Renamed from `figure`, which shadowed matplotlib.pyplot.figure imported
# at the top of the file.
pickup_map_fig = dict(data=pickuplocation, layout=layoutpan)
iplot(pickup_map_fig)
# Now we visualize the dropoff locations of customers in NYC.
dropofflocation = [go.Scattermapbox(
    lat=nyc['dropoff_latitude'],
    lon=nyc['dropoff_longitude'],
    customdata=nyc['key'],
    mode='markers',
    marker=dict(
        size=5,
        color='green',
        opacity=.2,
    ),
)]
# SECURITY NOTE: hard-coded Mapbox access token -- move to an environment
# variable before sharing this code.
layoutpan = go.Layout(
    autosize=False,
    mapbox=dict(
        accesstoken="pk.eyJ1Ijoic2hhejEzIiwiYSI6ImNqYXA3NjhmeDR4d3Iyd2w5M2phM3E2djQifQ.yyxsAzT94VGYYEEOhxy87w",
        bearing=10,
        pitch=5,
        zoom=10,
        center=dict(
            lat=40.721319,
            lon=-73.987130),
        style="mapbox://styles/mapbox/streets-v11"),
    width=900,
    height=700, title="Customer Dropoff Visualization in NYC")
# Renamed from `figure`, which shadowed matplotlib.pyplot.figure imported
# at the top of the file.
dropoff_map_fig = dict(data=dropofflocation, layout=layoutpan)
iplot(dropoff_map_fig)
plt.figure(figsize=(16,10))
# suptitle, not title: each subplot immediately sets its own axes title,
# which would overwrite a plain plt.title on the first axes.
plt.suptitle("Distribution of Fare Amount Across Boroughs")
# One subplot per borough comparing the (log) fare distribution of trips
# picked up there vs dropped off there.
for i, key in enumerate(nyc_boroughs.keys(), start=1):
    plt.subplot(3, 2, i)
    sns.kdeplot(np.log(nyc.loc[nyc['pickup_borough'] == key, 'fare_amount'].values),
                label='Pickup ' + key)
    # 'Dropoff ' now has a trailing space so the label reads "Dropoff queens".
    sns.kdeplot(np.log(nyc.loc[nyc['dropoff_borough'] == key, 'fare_amount'].values),
                label='Dropoff ' + key).set_title("Fare Amount (log scale) for " + key)
There is a significant difference between the pickup and dropoff fare-amount distributions for each borough except Manhattan.
plt.figure(figsize=(24,15))
# suptitle, not title: each subplot immediately sets its own axes title,
# which would overwrite a plain plt.title on the first axes.
plt.suptitle("Distribution of Trip Distances Across Boroughs")
# One subplot per borough comparing the (log) trip-distance distribution of
# trips picked up there vs dropped off there.
# NOTE(review): np.log yields -inf for any zero-distance trips -- consider
# filtering distance > 0 first.
for i, key in enumerate(nyc_boroughs.keys(), start=1):
    plt.subplot(3, 2, i)
    sns.kdeplot(np.log(nyc.loc[nyc['pickup_borough'] == key, 'distance'].values),
                label='Pickup ' + key)
    # 'Dropoff ' now has a trailing space so the label reads "Dropoff queens".
    sns.kdeplot(np.log(nyc.loc[nyc['dropoff_borough'] == key, 'distance'].values),
                label='Dropoff ' + key).set_title("Trip Distance (log scale) for " + key)
Dropoffs to the Bronx and Brooklyn tend to be long trips.
We will be implementing Multiple Linear Regression, Decision Trees, Random Forest and Boosted Trees.
Splitting the nyc data into training data and validation data.
# Labels are the values we want to predict
labels = np.array(nyc['fare_amount'])
# Remove the labels from the nyc
# axis 1 refers to the columns
# Non-numeric / identifier columns (key, timestamps, borough strings) are
# dropped too, since the sklearn models below expect numeric features.
features= nyc.drop(['fare_amount','key', 'pickup_datetime','time','pickup_borough','dropoff_borough'],axis = 1)
# Saving feature names for later use
feature_list = list(features.columns)
# Convert to numpy array
features = np.array(features)
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
# (25% held out for validation; fixed random_state for reproducibility)
train_nyc, valid_nyc, train_labels, valid_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)
# Looking at the shape of the training data and validation data
print('Training Features Shape:', train_nyc.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', valid_nyc.shape)
print('Testing Labels Shape:', valid_labels.shape)
# Importing the Linear Regression model from the sklearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Baseline model: ordinary least-squares regression on all numeric features.
lm = LinearRegression()
lm.fit(train_nyc,train_labels)
# Predicted fares on the validation set, rounded to cents for display.
y_pred=np.round(lm.predict(valid_nyc),2)
# Validation RMSE (MSE is symmetric, so the swapped argument order is harmless).
lm_rmse=np.sqrt(mean_squared_error(y_pred, valid_labels))
# Training RMSE, used to gauge over/under-fitting.
lm_train_rmse=np.sqrt(mean_squared_error(lm.predict(train_nyc), train_labels))
# "Variance" here means the train/test RMSE gap -- a rough overfitting signal.
lm_variance=abs(lm_train_rmse - lm_rmse)
print("Test RMSE for Linear Regression is ",lm_rmse)
print("Train RMSE for Linear Regression is ",lm_train_rmse)
print("Variance for Linear Regression is ",lm_variance)
# Importing the Random Forest from scikit learn package
from sklearn.ensemble import RandomForestRegressor
# Importing GridSearchCV which checks for the optimal n_estimator parameter
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
# NOTE(review): range(50, 100, 50) contains only [50], so this grid search
# evaluates a single candidate -- widen the range to actually tune.
est = range(50,100,50)
params_to_test = {'n_estimators': est}
rf = RandomForestRegressor(random_state = 101)
# 10-fold CV scored by negative MSE (sklearn maximises scores).
grid_search = GridSearchCV(rf, param_grid=params_to_test, cv=10, scoring='neg_mean_squared_error')
grid_search.fit(train_nyc, train_labels)
# Best estimator is refit on the full training set (GridSearchCV refit default).
best_model = grid_search.best_estimator_
# Use the forest's predict method on the test data
predictions = best_model.predict(valid_nyc)
# Calculate the absolute errors
errors = abs(predictions - valid_labels)
# Print out the mean absolute error (mae)
# NOTE(review): the unit printed is 'degrees.' but the target is fare in
# dollars -- label copied from a tutorial.
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
rf_rmse=np.sqrt(mean_squared_error(predictions, valid_labels))
rf_train_rmse=np.sqrt(mean_squared_error(best_model.predict(train_nyc), train_labels))
rf_variance=abs(rf_train_rmse - rf_rmse)
print("Test RMSE for Random Forest Regression is ",rf_rmse)
print("Train RMSE for Random Forest Regression is ",rf_train_rmse)
print("Variance for Random Forest Regression is ",rf_variance)
from sklearn.ensemble import GradientBoostingRegressor
# NOTE(review): range(50, 100, 50) contains only [50]; this grid search
# evaluates a single n_estimators candidate.
est = range(50,100,50)
params_to_test = {'n_estimators': est}
# NOTE(review): learning_rate=1 is aggressive for gradient boosting; smaller
# rates (e.g. 0.1) with more trees usually generalise better -- confirm.
gb = GradientBoostingRegressor(learning_rate=1, max_depth=3, random_state = 1)
# 10-fold CV scored by negative MSE, same setup as the random forest above.
grid_search_gb = GridSearchCV(gb, param_grid=params_to_test, cv=10, scoring='neg_mean_squared_error')
grid_search_gb.fit(train_nyc,train_labels)
best_model_gb = grid_search_gb.best_estimator_
# Use the forest's predict method on the test data
predictions_gb = best_model_gb.predict(valid_nyc)
# Calculate the absolute errors
errors_gb = abs(predictions_gb - valid_labels)
# Print out the mean absolute error (mae)
# NOTE(review): the unit is dollars, not 'degrees.' -- label copied from a
# tutorial.
print('Mean Absolute Error:', round(np.mean(errors_gb), 2), 'degrees.')
gb_rmse=np.sqrt(mean_squared_error(predictions_gb, valid_labels))
gb_train_rmse=np.sqrt(mean_squared_error(best_model_gb.predict(train_nyc), train_labels))
gb_variance=abs(gb_train_rmse - gb_rmse)
print("Test RMSE for Gradient Boost Regression is ",gb_rmse)
print("Train RMSE for Gradient Boost Regression is ",gb_train_rmse)
print("Variance for Gradient Boost Regression is ",gb_variance)
# Compare the three models' validation RMSE side by side (worst at the top).
# Fixes the displayed label typo 'Regrssion' -> 'Regression'.
regression = pd.DataFrame(
    {"regression": ['Multi Linear Regression', 'Random Forest',
                    'Gradient Boosting Regression'],
     "rmse": [lm_rmse, rf_rmse, gb_rmse]},
    columns=['regression', 'rmse'])
regression = regression.sort_values(by='rmse', ascending=False)
# Keyword x/y: seaborn >= 0.12 removed the positional (x, y) form of barplot.
sns.barplot(x=regression['rmse'], y=regression['regression'], palette='Set2')
plt.xlabel("RMSE")
plt.ylabel('Type of Regression Model')
plt.title('Performance Evaluation of different Regressions')